## This script uses HILDA to get the population, average incomes and wealth of the starting cohorts in the simulation.
## Based on 2018 HILDA data (latest available wealth data). 


# Preliminaries -----------------------------------------------------------

rm(list=ls())
gc()

options(scipen=999)


## custom smoother function
source("./R scripts/Data and parameters/0 Custom functions.R")


# Read HILDA grouped master ---------------------------------------------------------------

## filter to wave 18 for starting values
hilda_grouped_master <- qread("./Input data/Intermediate input data/hilda_grouped_master.qs") %>% 
  filter(wavenumber==18)



# Create new variables ----------------------------------------------------

## number of income and wealth quantiles
wlth_grps <- 4


hilda_grouped <- hilda_grouped_master %>% 
  
  ## dealing with a few outliers
  mutate(
    ## one outlier with -250000 other income - replace other inc with second lowest other inc (160k) (this will keep the total population number correct in cohort when compared with removing the obs)
    total_inc = ifelse(other_inc < -200000, total_inc+200000-160000, total_inc), 
    other_inc = ifelse(other_inc < -200000, -160000, other_inc),
    ## 9 outliers are children with relatively high wealth because they own housing. replace their housing wealth with 0, and remove housing debt from debt
    total_assets = ifelse(hgage < 15 & housing_assets>0, total_assets-housing_assets, total_assets),
    housing_assets = ifelse(hgage < 15 & housing_assets>0, 0, housing_assets),
    housing_debt = ifelse(hgage < 15 & housing_debt>0, 0, housing_debt)
  ) %>% 
  
  ## NB not modelling other debt
  mutate(
    total_wealth = total_assets - housing_debt
  ) %>% 
  
  ## calc some grouping variables for starting values - wealth group (inc group already calced in hilda_grouped_master and unaffected by outlier adjustment)
  ## create cohort variables for potential bequest recipient, home owner
  mutate(
    ## weighted quantile of wealth
    total_wealth_qtile = cut(total_wealth, 
                             ## cut points determined by weighted quantile (only works if cut points are unique)
                             breaks = Hmisc::wtd.quantile(.$total_wealth, 
                                                          weights = .$hhwte, 
                                                          probs = seq(0, 1, 1/wlth_grps)), 
                             include.lowest=T,
                             labels = c(1:wlth_grps),
                             ordered_result=T),
    
    ## id bequest received yet or not (by parent status) and parent age
    bequest_received = ifelse(is.na(parent_age), 1, 0 ),  ## 1 = already received bequest, can't receive again
    
    ## homeowner
    homeowner = ifelse(housing_assets>0, 1, 0)
  )

## Save this as hilda_grouped_starting_cohort for use later eg bequest receipt probability
qsave(hilda_grouped, "./Input data/Intermediate input data/hilda_grouped_starting_cohort.qs")
hilda_grouped <- qread("./Input data/Intermediate input data/hilda_grouped_starting_cohort.qs")


# Starting population -----------------------------------------------------

## Starting population in each cohort -- incl parent age grp
starting_n <- hilda_grouped %>% 
  ## number of people with parents in each parent age group
  group_by(age_grp, age_grp2, total_inc_qtile, total_wealth_qtile, bequest_received, homeowner, parent_age_grp) %>% 
  summarise(n_parent = sum(hhwte)) %>% 
  ## number of people
  group_by(age_grp, age_grp2, total_inc_qtile, total_wealth_qtile, bequest_received, homeowner) %>% 
  mutate(n = sum(n_parent)) %>% 
  arrange(parent_age_grp) %>% ## order by parent age grp
  ## pivot wide along parent age grp
  mutate(parent_age_grp = paste0("parent_age0_n_", parent_age_grp)) %>% 
  pivot_wider(names_from = parent_age_grp, values_from=n_parent) %>% 
  select(-parent_age0_n_NA)  ## NA parent ages we assume have already received bequest

## IN APPENDIX: About one quarter of the 2018 model population were deemed to have received an inheritance
starting_n %>% group_by(bequest_received) %>% summarise(n=sum(n)) %>% mutate(prop = n/sum(n))


## *** APPENDIX FIGURES C.3
starting_n_age_inc_plot <- ggplot(starting_n) +
  geom_col(aes(x = age_grp, y=n/1000, fill=total_inc_qtile, colour=total_inc_qtile)) +
  scale_x_discrete(breaks=unique(starting_n$age_grp %>% sort)[seq(1, 21, 4)],
                   labels = unique(starting_n$age_grp %>% sort)[seq(1, 21, 4)] %>% age_grp_labeller()) +
  ylab("Population ('000)") +
  xlab("Age group") +
  scale_fill_pc(labels = c("Q1", "Q2", "Q3", "Q4", "Q5")) +
  scale_colour_pc(labels = c("Q1", "Q2", "Q3", "Q4", "Q5")) +
  custom_plot_margin

emf(file=paste0("./Charts/starting_n_age_inc_plot", Sys.Date(), ".emf"), 
    width = 7.5/2.54, height = 7/2.54,
    pointsize=12,
    family="Arial")
starting_n_age_inc_plot
dev.off()

ggsave(file=paste0("./Charts/starting_n_age_inc_plot", Sys.Date(), ".svg"), 
       plot=  starting_n_age_inc_plot, 
       width = 7.5/2.54, height = 7/2.54)


starting_n_age_wealth_plot <- ggplot(starting_n) +
  geom_col(aes(x = age_grp, y=n/1000, fill=total_wealth_qtile, colour=total_wealth_qtile)) +
  scale_x_discrete(breaks=unique(starting_n$age_grp %>% sort)[seq(1, 21, 4)],
                   labels = unique(starting_n$age_grp %>% sort)[seq(1, 21, 4)] %>% age_grp_labeller()) +
  ylab("Population ('000)") +
  xlab("Age group") +
  scale_fill_pc(labels = c("Q1", "Q2", "Q3", "Q4")) +
  scale_colour_pc(labels = c("Q1", "Q2", "Q3", "Q4")) +
  custom_plot_margin

emf(file=paste0("./Charts/starting_n_age_wealth_plot", Sys.Date(), ".emf"), 
    width = 7.5/2.54, height = 7/2.54,
    pointsize=12,
    family="Arial")
starting_n_age_wealth_plot
dev.off()

ggsave(file=paste0("./Charts/starting_n_age_wealth_plot", Sys.Date(), ".svg"), 
       plot=  starting_n_age_wealth_plot, 
       width = 7.5/2.54, height = 7/2.54)


starting_n_age_ho_plot <- ggplot(starting_n) +
  geom_col(aes(x = age_grp, y=n/1000, fill=as.factor(homeowner), colour=as.factor(homeowner))) +
  scale_x_discrete(breaks=unique(starting_n$age_grp %>% sort)[seq(1, 21, 4)],
                   labels = unique(starting_n$age_grp %>% sort)[seq(1, 21, 4)] %>% age_grp_labeller()) +
  ylab("Population ('000)") +
  xlab("Age group") +
  scale_fill_pc(labels = c("Non-homeowner", "Homeowner")) +
  scale_colour_pc(labels = c("Non-homeowner", "Homeowner")) +
  custom_plot_margin

emf(file=paste0("./Charts/starting_n_age_ho_plot", Sys.Date(), ".emf"), 
    width = 7.5/2.54, height = 7/2.54,
    pointsize=12,
    family="Arial")
starting_n_age_ho_plot
dev.off()

ggsave(file=paste0("./Charts/starting_n_age_ho_plot", Sys.Date(), ".svg"), 
       plot=  starting_n_age_ho_plot, 
       width = 7.5/2.54, height = 7/2.54)


starting_n_age_beqrec_plot <- ggplot(starting_n) +
  geom_col(aes(x = age_grp, y=n/1000, fill=as.factor(bequest_received), colour=as.factor(bequest_received))) +
  scale_x_discrete(breaks=unique(starting_n$age_grp %>% sort)[seq(1, 21, 4)],
                   labels = unique(starting_n$age_grp %>% sort)[seq(1, 21, 4)] %>% age_grp_labeller()) +
  ylab("Population ('000)") +
  xlab("Age group") +
  scale_fill_pc(labels = c("Not received", "Received")) +
  scale_colour_pc(labels = c("Not received", "Received")) +
  custom_plot_margin

emf(file=paste0("./Charts/starting_n_age_beqrec_plot", Sys.Date(), ".emf"), 
    width = 7.5/2.54, height = 7/2.54,
    pointsize=12,
    family="Arial")
starting_n_age_beqrec_plot
dev.off()

ggsave(file=paste0("./Charts/starting_n_age_beqrec_plot", Sys.Date(), ".svg"), 
       plot=  starting_n_age_beqrec_plot, 
       width = 7.5/2.54, height = 7/2.54)

#write.csv(starting_n %>% group_by(age_grp, homeowner) %>% summarise(n=sum(n)) %>% mutate(age_grp = age_grp_labeller(age_grp)), "temp.csv")


# Starting income ---------------------------------------------------------

## Get average incomes for each age and income cohort - grouping very young and very old (age_group2)
## Initial analysis comparing differences across wealth groups or bequest recipients do not reveal huge differences

starting_inc <- hilda_grouped %>% 
  group_by(age_grp2, total_inc_qtile) %>% 
  ## summarise averages
  mutate(
    n_age_grp2 = sum(hhwte), 
    sample_freq_age_grp2 = n(),
    
    wages_av = wtd.mean(wsfes, hhwte),
    other_inc_av = wtd.mean(other_inc, hhwte)
  ) %>% 
  ungroup %>% 
  distinct(age_grp2, total_inc_qtile, n_age_grp2, sample_freq_age_grp2, wages_av, other_inc_av) %>% 
  ## smooth out average incomes across age groups using custom function
  arrange(total_inc_qtile, age_grp2) %>% 
  group_by(total_inc_qtile) %>% 
  mutate(across(contains("av"),  ~custom_smoother(.x) , .names="{.col}_smooth"))

## plot - Unused
# starting_inc_plot_data <- starting_inc %>%
#   select(age_grp2, total_inc_qtile, wages_av_smooth, other_inc_av_smooth) %>%
#   distinct %>%
#   pivot_longer(contains("av"), names_to = "inc_type", values_to = "value")
# 
# starting_inc_plot <- ggplot(starting_inc_plot_data ) +
#   geom_col(aes(x = age_grp2, y = value, fill=inc_type)) +
#   facet_grid(vars(total_inc_qtile) , scales="free")



# Starting ASSETS and DEBT ---------------------------------------------------------

## Get average wealth for each age, wealth and homeowner cohort 

starting_wealth <- hilda_grouped %>% 
  group_by(age_grp2,  total_wealth_qtile, homeowner) %>% 
  ## summarise averages
  summarise(
    n = sum(hhwte), 
    sample_freq = n(),
    
    housing_assets_av = ifelse(sample_freq>1, wtd.mean(housing_assets, hhwte), housing_assets[1]),
    super_assets_av = ifelse(sample_freq>1, wtd.mean(super_assets, hhwte), super_assets[1]),
    other_assets_av = ifelse(sample_freq>1, wtd.mean(other_assets, hhwte), other_assets[1]),
    housing_debt_av = ifelse(sample_freq>1, wtd.mean(housing_debt, hhwte), housing_debt[1])
  ) %>% 
  arrange( total_wealth_qtile, homeowner, age_grp2) %>% 
  group_by( total_wealth_qtile, homeowner) %>% 
  ## smooth out average wealth across age groups 
  mutate(across(contains("av"),  ~custom_smoother(.x) , .names="{.col}_smooth"))



## plot by wealth - Unused
# starting_wealth_plot_data <- starting_wealth %>%
#   select(age_grp2, total_wealth_qtile, homeowner, contains("smooth"), sample_freq) %>%
#   mutate(housing_debt_av_smooth = -housing_debt_av_smooth) %>%
#   distinct %>%
#   pivot_longer(contains("smooth"), names_to = "wealth_type", values_to = "value")
# 
# starting_wealth_plot_ho0 <- ggplot(starting_wealth_plot_data %>% filter(homeowner==0)) +
#   geom_col(aes(x = age_grp2, y = value, fill=wealth_type)) +
#   facet_grid( vars(total_wealth_qtile)  , scales="free")
# 
# starting_wealth_plot_ho1 <- ggplot(starting_wealth_plot_data %>% filter(homeowner==1)) +
#   geom_col(aes(x = age_grp2, y = value, fill=wealth_type)) +
#   facet_grid( vars(total_wealth_qtile)  , scales="free")



# Merge starting value data -----------------------------------------------

starting_cohorts <- left_join(starting_n %>% select(age_grp, age_grp2, total_inc_qtile, total_wealth_qtile, bequest_received, homeowner, n, contains("parent_age0_n_")), 
                              starting_inc %>% select(age_grp2, total_inc_qtile, contains("smooth"))) %>% 
  left_join(starting_wealth %>% select(age_grp2, total_wealth_qtile, homeowner, sample_freq, contains("smooth"))) %>% 
  ## remove n=0 (0 weight for that group)
  filter(n!=0)

qsave(starting_cohorts %>% ungroup, "./Input data/starting_cohorts_aiwbh_p.qs") 